The following report describes my approach, methods, and results for the Intrum – Home Quiz Challenge, covering data exploration and preparation for modelling.
This is the first notebook, where I worked on data preparation and data exploration.
# Verify where the notebook is running from and define the I/O locations.
import os
print("My current working directory is", os.getcwd())
# NOTE(review): absolute, machine-specific paths — consider making these configurable.
config_input = '/Users/jackyb/Projects/Lyon_Ds/CreditRiskPayment_JB/data/input/'
config_output = '/Users/jackyb/Projects/Lyon_Ds/CreditRiskPayment_JB/data/processed/'
# Full path to the SQLite database holding the challenge data.
DATABASE = config_input + "dataset.db"
DATABASE
import sqlite3
from sqlite3 import Error
import pandas
import numpy
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from pandas import Timestamp
from matplotlib.dates import DateFormatter
import matplotlib.dates as mdates
from pandas.plotting import scatter_matrix
from collections import Counter
from scipy.stats import pearsonr
import missingno as msno
from sklearn.impute import SimpleImputer
import missingno as msno
import warnings
# Show wide/long frames fully in the notebook and silence library warnings.
pandas.set_option('display.max_columns', 1000)
pandas.set_option('display.max_rows', 1000)
warnings.filterwarnings('ignore')
# general analysis settings
PERCENTILE = [.25, .50, .75]  # percentiles reported by describe()
INCLUDE = ['object', 'float', 'int']  # dtypes included in describe() calls
NUMERICS = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']  # numeric dtypes for column selection
# palette colors used by the seaborn plots below
SETCOL = ["windows blue", "amber", "greyish", "faded green", "dusty purple"]
SETCOL1 = ["#6dccda", "#cdcc5d", "#a2a2a2", "#ed97ca", "#a8786e", "#ad8bc9", "#ed665d", "#67bf5c", "#ff9e4a", "#729ece"]
SETCOL2 = ["#ed665d", "#67bf5c", "#ff9e4a", "#729ece", "#ad8bc9"]
SETCOL3 = [ "#729ece", "#67bf5c", "#ff9e4a", "#ad8bc9", "#ed665d"]
SETCOL4 = [ "#6dccda", "#ed97ca", "#729ece", "#ad8bc9", "#ff9e4a"]
def create_connection(path: str):
    """Open a SQLite connection to the database at *path*.

    Prints a status message and returns the connection object,
    or None when the connection could not be established.
    """
    try:
        con = sqlite3.connect(path)
    except Error as exc:
        print(f"The error '{exc}' occurred")
        return None
    print("Connection to SQLite DB successful")
    return con
def show_tables(con):
    """Return the names of all tables in the connected SQLite database."""
    cur = con.cursor()
    cur.execute("SELECT name FROM sqlite_master WHERE type='table';")
    rows = cur.fetchall()
    return rows
def get_general_information_of_the_data(df, percentiles=(.25, .50, .75)):
    """
    Expose the general information from the original dataframe.

    :param df: dataframe to summarise
    :param percentiles: percentiles passed to DataFrame.describe; defaults to
        the same quartiles as the module-level PERCENTILE, so existing calls
        behave identically while the function no longer depends on a global.
    :return: describe() summary covering all columns (include='all')
    """
    print("----------------------------------------------------------------")
    print("General statistic of the data: \n")
    return df.describe(percentiles=list(percentiles), include='all')
def check_df_duplicates(df):
    """
    Check for duplicate rows in the dataset.

    :param df: dataframe
    :return: dataframe with the duplicate rows (empty when there are none).
        Fixed: the original returned None on the no-duplicates path, so
        callers sometimes got a DataFrame and sometimes None.
    """
    print("\n----------------------------------------------------------------")
    print("Checking duplicates: \n")
    duplicate_rows = df[df.duplicated()]
    if len(duplicate_rows.index) != 0:
        print("Duplicate Rows except first occurrence based on all columns are :")
        print(duplicate_rows)
        print("\n")
    else:
        print("There are no duplicate rows in the dataset")
    return duplicate_rows
def columns_as_list(df):
    """
    Variables as lists by type.

    :param df: dataframe
    :return: three lists of column names: all columns, object columns,
        numeric columns
    """
    all_var = list(df.columns)
    print("----------------------------------------------------------------")
    print("List of all variables: \n")
    print(all_var)
    obj_var = list(df.select_dtypes(['object']).columns)
    if obj_var:
        print("----------------------------------------------------------------")
        print("List of object variables: \n")
        print(obj_var)
    # 'number' covers every numeric dtype, including int8/uint variants that
    # the hand-written NUMERICS list missed.
    num_var = list(df.select_dtypes(include='number').columns)
    if num_var:
        print("----------------------------------------------------------------")
        print("List of numeric variables: \n")
        print(num_var)
    return all_var, obj_var, num_var
def match_column_name(df, varcode_search: str):
    """
    Look up the human-readable name of a variable code in a metadata frame.

    :param df: dataframe with 'varcode' and 'name' columns
    :param varcode_search: variable code to look up
    :return: array of matching names (also printed for notebook readability)
    """
    # Bug fix: the original filtered the global `datasetdf` and ignored the
    # `df` parameter entirely; it also returned nothing.
    match = df.loc[df['varcode'] == varcode_search]['name'].values
    print('varcode: ', varcode_search, ' - meaning: ', match)
    return match
# Open the database, list its tables, and load both of them.
con = create_connection(DATABASE)
tables_on_database = show_tables(con)
tables_on_database
# NOTE(review): the variable names look swapped relative to the table names
# (metadatadf reads 'dataset', datasetdf reads 'metadata'), but the rest of
# the notebook consistently uses metadatadf as the sample data and datasetdf
# as the varcode -> name dictionary — confirm before renaming.
metadatadf = pandas.read_sql_query('SELECT * FROM dataset;', con)
datasetdf = pandas.read_sql_query('SELECT * FROM metadata;', con)
match_column_name(datasetdf, 'ap090')
datasetdf
metadatadf.head(5)
get_general_information_of_the_data(metadatadf)
# First visual pass over the variables, coloured by the target ct090.
sns.set(font_scale=2)
sns.pairplot(metadatadf, hue='ct090', palette = 'Set1')
The first look at the data shows NaN values, a few categorical variables, outliers, and some variables that are highly correlated.
Some of these conditions will be tackled to prepare the data for modelling.
# Parse the raw keydate into proper datetime columns and derive the month.
metadatadf['kdate_ori'] = pandas.to_datetime(metadatadf['keydate'])
metadatadf['kdate'] = metadatadf['kdate_ori'].map(Timestamp.date)  # strip the time component
metadatadf['kdate'] = pandas.to_datetime(metadatadf['kdate'])
metadatadf['month'] = metadatadf['kdate'].dt.month
# Date range covered by the sample.
print (metadatadf['kdate'].min())
print (metadatadf['kdate'].max())
Observations:
I am aware that this input is not the precise date of the sample point's registration, nor the payment day, but it gives an estimate (period + 90 days) of when the debt was paid or not paid, based on the event tracked on the case. I will give this variable a try because, as human beings, we have certain "patterns" — it is possible that people pay in months where general expenses are low. This hypothesis depends on the country; for example, in the USA or Canada family expenses typically increase around Halloween. If the debtors are "retail", this might have an impact on the result of the debt collection; the strategy could also take these points into account.
metadatadf.info()
# Keep the per-type column lists for the later type-specific analysis.
all_var_metadf, obj_var_metadf, num_var_metadf = columns_as_list(metadatadf)
# Number of each type of column
plt.figure(figsize=(8, 4), dpi=80)
metadatadf.dtypes.value_counts().sort_values().plot(kind='barh', color=['#6dccda', '#cdcc5d', '#a2a2a2','#ed97ca' ])
plt.title('Number of columns - Data Types',fontsize=18)
plt.xlabel('Number of columns',fontsize=16)
plt.ylabel('Data type',fontsize=16)
A way to check the null values is to define a function that counts them and extracts a dataframe with the variables that have a higher proportion of null values; then we can get a better idea of the magnitude of NaN in the dataset.
def counting_nullval(df):
    """Summarise missing values per column.

    Builds a table with the absolute count and percentage of NaN per column,
    keeps only the columns that actually have missing values, and returns it
    sorted by percentage (descending), rounded to 3 decimals.
    """
    missing_counts = df.isnull().sum()
    missing_pct = 100 * missing_counts / len(df)
    summary = pandas.concat([missing_counts, missing_pct], axis=1)
    summary = summary.rename(columns={0: 'Sum_MissingValues', 1: 'Perc_MissingValues'})
    with_missing = summary[summary.Perc_MissingValues != 0]
    with_missing = with_missing.sort_values('Perc_MissingValues', ascending=False)
    return with_missing.round({'Perc_MissingValues': 3})
# Missing-value overview: summary table plus missingno visualisations.
missing_values = counting_nullval(metadatadf)
missing_values.style.background_gradient(cmap='Reds')
msno.bar(metadatadf)
msno.matrix(metadatadf)
# Sorting by d2112 makes correlated missingness visible in the matrix plot.
# (renamed from `sorted`, which shadowed the builtin of the same name)
sorted_by_d2112 = metadatadf.sort_values('d2112')
msno.matrix(sorted_by_d2112)
Observations:
The variables with more than 60% NaN values will be dropped from the dataset; any manipulation of these variables could add bias to the model. Variables b007, d2112 and d1205 might be related, given the high correlation in their numbers of missing values.
Actions:
A data visualisation will be performed to find possible issues in the data that could affect the models. The decisions on how to handle NaNs or how to deal with outliers will be taken depending on the variable.
df_c = metadatadf.copy()
def displot_with_stats(df, col, color):
    """Plot a histogram of df[col] and print its summary statistics.

    Also reports the count and percentage of missing values of the column.
    """
    sns.set_palette(SETCOL1)
    sns.set(font_scale=1.3)
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.set_style('whitegrid')
    # Fix: sns.distplot is deprecated (removed in recent seaborn releases);
    # histplot with the same bin count reproduces the kde=False histogram.
    sns.histplot(df[col], bins=150, color=color, ax=ax)
    plt.show()
    print(f"Valores únicos: {df[col].describe(percentiles = PERCENTILE, include = INCLUDE)}")
    print(f"Número de nulos: {df[col].isna().sum()}")
    print(f"Porcentaje de nulos: {round((((df[col].isna().sum())/df[col].size)*100), 2)}")
def box_plot_visual(df, col_x, col_y):
    """Draw a boxplot of col_y grouped by col_x on a fresh figure."""
    sns.set_style("darkgrid")
    figure, axis = plt.subplots(figsize=(8, 4))
    sns.set(font_scale=1.5)
    sns.boxplot(x=col_x, y=col_y, data=df, palette=SETCOL1)
def box_plot_visual_list(df, list_col):
    """Draw one 'month' boxplot per column of list_col, each on its own figure."""
    # The enumerate index was never used, so iterate the columns directly.
    for col in list_col:
        sns.set_style("darkgrid")
        fig, ax = plt.subplots(figsize=(8, 4))
        sns.set(font_scale=1.5)
        sns.boxplot(x='month', y=col, data=df, palette=SETCOL1)
# Summary of the missing-value situation before deciding what to drop.
print ('********** Summary Nulls values **********')
print('\n')
print ("The raw data load has " + str(metadatadf.shape[1]) + " columns.\n"
       "There are " + str(missing_values.shape[0]) +
       " columns that have missing values. ")
print('\n')
# Renamed from nulls_gt75: the threshold actually applied is 60%.
nulls_ge60 = missing_values[missing_values.Perc_MissingValues >= 60]
print ("The number of columns with more than 60% of null values are: " + str(nulls_ge60["Perc_MissingValues"].count()) )
The Strategy
I am going to drop the columns with more than 60% null values, as mentioned above. The reasons:
a) In this case I am going to follow the 60/40 relation: above 60% missing data, any modification done to fill those null values might add bias to the variable.
b) On the other hand, there is only one column with more than 60% null values, and considering the number of variables, the analysis time, and its lack of input to the overall understanding of the data, I am going to proceed to drop it.
# d1205 is the only column above the 60% missing-value threshold.
df_c.drop(['d1205'], axis=1, inplace = True)
# Metadata descriptions of the object-typed variables.
datasetdf[datasetdf['varcode'].isin(obj_var_metadf)]
def barplot_labels(df, col, most_common, plot_name):
    """Horizontal bar plot of the most frequent values of df[col].

    Shows the `most_common` top categories with their counts as bar labels.
    """
    sns.set_style('whitegrid')
    sns.set_palette(SETCOL1)
    sns.set(font_scale=1.3)
    plt.figure(figsize=(10, 7), dpi=80)
    top = Counter(df[col].tolist()).most_common(most_common)
    categories = [item[0] for item in top]
    counts = [item[1] for item in top]
    ax = sns.barplot(x=counts, y=categories, orient='h')
    ax.bar_label(ax.containers[0])
    plt.title(plot_name)
    plt.xlabel('Counter')
    plt.ylabel(plot_name)
# Count the distinct values of each object-typed column.
object_data = pandas.DataFrame(df_c.select_dtypes('object').apply(pandas.Series.nunique, axis = 0))
object_data = object_data.rename(columns = {0 : 'Sum_Objtype'})
#object_data
Obj_data_sort = object_data.sort_values('Sum_Objtype', ascending=False)
Obj_data_sort.head(7)
Observation:
The categorical variables in this dataset are tricky. The only variable that really contains strings is c0039; the other variables define categories with "int" numbers. As I don't know the origin of the data, I am assuming it has undergone some transformation, as described in the challenge description. Therefore, I will keep those variables as they are.
barplot_labels(df_c, 'c0039', 8, 'c0039: Industry Code')
Observation: As the categories K6491, K6420 and K6511 each account for less than 0.1% of the total dataset, I decided to group them together, reducing this categorical variable to 6 categories for the model.
def c0039_regroup(a):
    """Collapse the rare industry codes into a single 'K_others' bucket.

    Any other code is returned unchanged. (The original if/elif chain tested
    'K6511' twice; the duplicate dead branch is removed.)
    """
    if a in ('K6511', 'K6420', 'K6491'):
        return 'K_others'
    return a
# Apply the regrouping and re-plot to confirm the rare codes were merged.
df_c['c0039'] = df_c['c0039'].apply(c0039_regroup)
barplot_labels(df_c, 'c0039', 8, 'c0039: Industry Code')
df_c.d0012.unique()
barplot_labels(df_c, 'd0012', 18, 'd0012: Last Original Closing Code Of Customer')
df_c.c0044.unique()
barplot_labels(df_c, 'c0044', 20, 'c0044: Client Name')
Observations:
Regarding the "Client Name" variable, five clients account for most of the data samples; the reason is unknown.
The meaning of the variable "Last Original Closing Code Of Customer" is not clear; however, the dominance of categories 4, 1 and 7 is.
# Metadata descriptions of the numeric variables.
datasetdf[datasetdf['varcode'].isin(num_var_metadf)]
missing_values.style.background_gradient(cmap='Reds')
displot_with_stats(df_c, 'c0001', "#6dccda" )
# Boxplot of c0001 per industry code, annotated with the group medians.
fig, ax = plt.subplots(figsize=(8,4))
sns.set(font_scale=1.5)
box_plot = sns.boxplot(x = 'c0039', y = 'c0001', data = df_c, palette=SETCOL1)
medians = round(df_c.groupby(['c0039'])['c0001'].median(),2)
vertical_offset = round(df_c['c0001'].median() * 0.05, 2)# offset from median for display
for xtick in box_plot.get_xticks():
    box_plot.text(xtick, medians.iloc[xtick] + vertical_offset,medians.iloc[xtick],
                  horizontalalignment = 'center', size = 'x-small', color='w')#, weight = 'semibold')
Observations:
The variable presents outliers that will be trimmed.
The NaNs will be imputed based on the mean per category of the variable c0039.
# Trim the outliers: cap c0001 at 48000 (values above are clipped to the cap).
# NOTE(review): the original comment said "keep data samples only < 50",
# which does not match the code — the threshold applied is 48000.
df_c['c0001'].values[df_c['c0001'] >= 48000] = 48000
def impute_c0001_by_c0039(cols):
    """Impute a missing c0001 with the mean c0001 of its c0039 group.

    :param cols: pair-like of (c0001, c0039) values for one row
    :return: the original c0001 when present; otherwise the group mean
        (rounded to 2 decimals) for the known industry codes, or the NaN
        unchanged when the code is unknown.

    NOTE: reads the global df_c and recomputes the group mean on every call,
    exactly as the original did; the six copy-pasted per-code branches are
    collapsed into a single lookup.
    """
    c0001, c0039 = cols[0], cols[1]
    if pandas.isnull(c0001):
        if c0039 in ('K6622', 'K6512', 'K6499', 'K6619', 'K6419', 'K_others'):
            return round(df_c[df_c['c0039'] == c0039]['c0001'].mean(), 2)
    return c0001
# Apply the imputation and verify no NaNs remain in c0001.
df_c['c0001'] = df_c[['c0001','c0039']].apply(impute_c0001_by_c0039,axis=1)
df_c['c0001'].isnull().sum()
displot_with_stats(df_c, 'c0001', "#6dccda" )
# Next variable: a8404.
df_c['a8404'].isnull().sum()
displot_with_stats(df_c, 'a8404', "#6dccda" )
Observations:
The NaNs will be imputed based on the mean per category of the variable c0039.
def impute_a8404_by_c0039(cols):
    """Impute a missing a8404 with a fixed per-c0039 value.

    :param cols: pair-like of (a8404, c0039) values for one row
    :return: the original a8404 when present; otherwise the hard-coded
        per-group value (a NaN stays NaN for unknown codes).

    NOTE(review): the constants look like precomputed group means of a8404
    by c0039 — confirm against the data if the dataset changes.
    """
    group_values = {
        'K6622': 0.18,
        'K6512': 0.25,
        'K6499': 0.22,
        'K6619': 0.24,
        'K_others': 0.2,
        'K6419': 0.3,
    }
    a8404, c0039 = cols[0], cols[1]
    if pandas.isnull(a8404):
        return group_values.get(c0039, a8404)
    return a8404
# Apply the imputation and verify no NaNs remain in a8404.
df_c['a8404'] = df_c[['a8404','c0039']].apply(impute_a8404_by_c0039,axis=1)
df_c['a8404'].isnull().sum()
displot_with_stats(df_c, 'a8404', "#6dccda" )
# Re-index the frame by the parsed key date for the time-based views below.
dfindex = df_c.set_index('kdate')
dfindex.head(4)
dfindex['d0031'][75:85]
displot_with_stats(dfindex, 'd0031', "#6dccda" )
# Boxplot of d0031 per month, annotated with the monthly means.
fig, ax = plt.subplots(figsize=(8,4))
sns.set(font_scale=1.5)
box_plot = sns.boxplot(x = 'month', y = 'd0031', data = dfindex, palette=SETCOL1)
medians = round(dfindex.groupby(['month'])['d0031'].mean(),2)
vertical_offset = round(dfindex['d0031'].median() * 0.05, 2)# offset from median for display
for xtick in box_plot.get_xticks():
    box_plot.text(xtick, medians.iloc[xtick] + vertical_offset,medians.iloc[xtick],
                  horizontalalignment = 'center', size = 'x-small', color='w')#, weight = 'semibold')
Observations:
For this variable and the next ones, the technique selected for NaN imputation differs from the one used for the variables analysed above. The reason is that when the same techniques were applied, the data distribution changed, so I decided to apply other techniques that, as a first approach, do not change the data distribution.
The technique that worked best after several tests was imputing with ffill, which replaces NaNs with the last observed value. This technique is widely used in time-series problems, but I decided to implement it here because of the bounded relation between variables.
# NOTE(review): no fillna call for d0031 appears between this check and the
# previous one — the imputation cell may be missing; verify execution order.
dfindex['d0031'][75:85]
displot_with_stats(dfindex, 'd0031', "#6dccda" )
displot_with_stats(dfindex, 'c9008', "#6dccda" )
# Boxplot of c9008 per industry code, annotated with the group medians.
fig, ax = plt.subplots(figsize=(8,3))
sns.set(font_scale=1.5)
box_plot = sns.boxplot(x = 'c0039', y = 'c9008', data = dfindex, palette=SETCOL1)
medians = round(dfindex.groupby(['c0039'])['c9008'].median(),2)
# NOTE(review): the offset is computed from c0001, not c9008 — possibly a
# copy-paste slip; confirm the intended label placement scale.
vertical_offset = round(dfindex['c0001'].median() * 0.05, 2)# offset from median for display
for xtick in box_plot.get_xticks():
    box_plot.text(xtick, medians.iloc[xtick] + vertical_offset,medians.iloc[xtick],
                  horizontalalignment = 'center', size = 'x-small', color='w')#, weight = 'semibold')
dfindex['c9008'].isnull().sum()
dfindex['c9008'][701:714]
def impute_c9008_by_c0039(cols):
    """Impute a missing c9008 with the mean c9008 of its c0039 group.

    :param cols: pair-like of (c9008, c0039) values for one row
    :return: the original c9008 when present; otherwise the group mean
        (rounded to 2 decimals) for the known industry codes, or the NaN
        unchanged when the code is unknown.

    NOTE: group means are computed from the global df_c (the pre-index copy),
    mirroring the original; the six copy-pasted branches are collapsed into
    a single lookup.
    """
    c9008, c0039 = cols[0], cols[1]
    if pandas.isnull(c9008):
        if c0039 in ('K6622', 'K6512', 'K6499', 'K6619', 'K6419', 'K_others'):
            return round(df_c[df_c['c0039'] == c0039]['c9008'].mean(), 2)
    return c9008
# Apply the imputation and verify no NaNs remain in c9008.
dfindex['c9008'] = dfindex[['c9008','c0039']].apply(impute_c9008_by_c0039,axis=1)
dfindex['c9008'].isnull().sum()
displot_with_stats(dfindex, 'b0007', "#6dccda" )
# Boxplot of b0007 per month, annotated with the monthly means.
fig, ax = plt.subplots(figsize=(15,8))
sns.set(font_scale=1.5)
box_plot = sns.boxplot(x = 'month', y = 'b0007', data = dfindex, palette=SETCOL1)
medians = round(dfindex.groupby(['month'])['b0007'].mean(),2)
vertical_offset = round(dfindex['b0007'].median() * 0.05, 2)# offset from median for display
for xtick in box_plot.get_xticks():
    box_plot.text(xtick, medians.iloc[xtick] + vertical_offset,medians.iloc[xtick],
                  horizontalalignment = 'center', size = 'x-small', color='w')#, weight = 'semibold')
# Inspect the 10th/90th percentiles before capping the outliers.
print(dfindex['b0007'].quantile(0.10))
print(dfindex['b0007'].quantile(0.90))
#df_c['b0007'] = numpy.where(df_c['b0007'] <2960.0, 2960.0,df['Income'])
# Cap b0007: every value above 1001 is replaced by 1000.52.
dfindex['b0007'] = numpy.where(dfindex['b0007'] >1001, 1000.52,dfindex['b0007'])
print(dfindex['b0007'].skew())
displot_with_stats(dfindex, 'b0007', "#6dccda" )
dfindex['b0007'][75:85]
# NOTE(review): the prose above mentions ffill, but bfill (next observed
# value) is used here — confirm which was intended.
dfindex['b0007'].fillna(method='bfill',inplace=True)
dfindex['b0007'][75:85]
displot_with_stats(dfindex, 'b0007', "#6dccda" )
displot_with_stats(dfindex, 'd2112', "#6dccda" )
# Boxplot of d2112 per month, annotated with the monthly means.
fig, ax = plt.subplots(figsize=(15,8))
sns.set(font_scale=1.5)
box_plot = sns.boxplot(x = 'month', y = 'd2112', data = dfindex, palette=SETCOL1)
medians = round(dfindex.groupby(['month'])['d2112'].mean(),2)
vertical_offset = round(dfindex['d2112'].median() * 0.05, 2)# offset from median for display
for xtick in box_plot.get_xticks():
    box_plot.text(xtick, medians.iloc[xtick] + vertical_offset,medians.iloc[xtick],
                  horizontalalignment = 'center', size = 'x-small', color='w')#, weight = 'semibold')
dfindex['d2112'][75:85]
# Impute d2112 with bfill, as for b0007 above.
dfindex['d2112'].fillna(method='bfill',inplace=True)
dfindex['d2112'][75:85]
The strategy:
At this point, I am going to check how the variables selected so far correlate with the target variable. Any variable that is highly correlated with another variable or with the target will be dropped from the dataset used for modelling.
all_var_metadf
# Final working copy without the identifier and the redundant date columns.
df = dfindex.copy()
df.drop(['case_id','kdate_ori','keydate'], axis=1, inplace = True)
df.isnull().sum()
# Correlation btw variables
fig = plt.figure(figsize=(10,7))
sns.heatmap(df.corr())
plt.title("Correlation Heatmap Data")
plt.show()
#expt = df_clean.set_experiment("Correlated features")
# corr matrix
cor = df.corr()
cor.loc[:,:] = numpy.tril(cor, k=-1) # below main lower triangle of an array
cor_stack = cor.stack()
# Report the strongly correlated pairs (|corr| > 0.75).
print("Columns with corr. greater than 0.75 - ")
print(cor_stack[(cor_stack > 0.75) | (cor_stack < -0.75)])
# Bug fix: `.unique` without parentheses only displayed the bound method;
# call it to actually list the target's distinct values.
df_c['ct090'].unique()
# c0001 is strongly correlated with another variable, so drop it.
df.drop(['c0001'], axis=1, inplace = True)
all_var_metadf, obj_var_metadf, num_var_metadf = columns_as_list(df)
datasetdf[datasetdf['varcode'].isin(all_var_metadf)]
# Scatter each remaining variable against c0015 (AmountOfCase) and c0031,
# coloured by the target ct090.
for i, col in enumerate(all_var_metadf):
    sns.set_style('whitegrid')
    sns.set_palette(SETCOL1)
    print("\n----------------------------------------------------------------")
    # Bug fix: the message named c0015 twice; the second panel plots c0031.
    print("Analysis on the variable c0015: AmountOfCase and c0031")
    print(f"var #{i+1}: {col}\n")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[6, 2])
    sns.set(font_scale=0.6)
    sns.scatterplot(x=df['c0015'], y=df[col], ax=ax1, hue = df['ct090'] )
    sns.scatterplot(x=df['c0031'], y=df[col], ax=ax2, hue = df['ct090'])
    label_font = {'size':'11'}
    plt.show()
# Scatter each remaining variable against c0019 (NumberOfTelephonesCalls)
# and the derived month, coloured by the target ct090.
for i, col in enumerate(all_var_metadf):
    sns.set_style('whitegrid')
    sns.set_palette(SETCOL1)
    print("\n----------------------------------------------------------------")
    print("Analysis on the variable c0019: NumberOfTelephonesCalls and month")
    print(f"var #{i+1}: {col}\n")
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[6, 2])
    sns.set(font_scale=0.6)
    sns.scatterplot(x=df['c0019'], y=df[col], ax=ax1, hue = df['ct090'] )
    sns.scatterplot(x=df['month'], y=df[col], ax=ax2, hue = df['ct090'])
    label_font = {'size':'13'}
    plt.show()
Observations:
It is clear that the dataset is unbalanced.
There is no clear relationship between the analysed variables, and in some of them the data samples are concentrated in a narrow range.
Regarding the variable month, it can be appreciated that some months display no sample points with target = 1. For example, with the variable c0015 (Amount of case), months 2, 3 and 8 show very few or no samples with the target equal to 1. Therefore, a hypothesis can be raised: adding the 90 days to those months gives months 5 (2+3), 6 (3+3) and 12 (8+3), which could be considered the months least suited to collecting debt — or the months where the strategies applied are less efficient.
# Correlation of every remaining variable with the target ct090.
plt.figure(figsize=(12,3))
df.corr()['ct090'].sort_values().drop('ct090').plot(kind='bar')
cor_stack_1 = df.corr()['ct090'].sort_values().drop('ct090')
cor_stack_1
Observation:
There is no variable with a high correlation against the target, therefore none will be dropped under this premise.
A clean dataframe is exported after applying the transformations that adapt the data for modelling. It will be saved under the ...data/processed path.
# Export the cleaned dataframe for the modelling notebook.
df.head(3)
df_clean = df.copy()
# Bug fix: reset_index() is not in-place; the original discarded its result,
# so the kdate index was never restored as a column before the export.
df_clean = df_clean.reset_index()
df_clean.to_csv((config_output + "df_clean.csv"))